vignettes/2_usage.Rmd
2_usage.Rmdand then use the seurat object as input of SciGeneX.
SciGeneX assumes that the normalized count matrix is saved in the “data” slot of seurat object and can be loaded with GetAssayData(object, slot="data). If not or if you didn’t use seurat, you can also provide a normalized matrix as input.
For this tutorial, we will use scRNA-seq dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available on 10x Genomics. This dataset contains 2700 single cells sequenced on the Illumina NextSeq 50. You can download it here.
In this step, we will run the pre-processing steps from common seurat pipeline. If you already have your pre-processed seurat object or your normalized count matrix, you can skip this step.
library(Seurat)
# _______________
# Setup the Seurat object
# ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
# Read 10x data
pbmc <- Read10X(data.dir = "../pbmc3k/filtered_gene_bc_matrices/hg19")
# Create Seurat object
pbmc_seurat <- CreateSeuratObject(counts = pbmc, project = "pbmc3k",
min.cells = 3,
min.features = 200)## An object of class Seurat
## 13714 features across 2700 samples within 1 assay
## Active assay: RNA (13714 features, 0 variable features)
We run here the classical steps of the seurat pipeline. For more information, you can check their website here.
# _______________________
# Run the classical pipeline of Seurat
# ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
# Quality control
pbmc_seurat[["percent.mt"]] <- PercentageFeatureSet(pbmc_seurat,
pattern = "^MT-")
pbmc_seurat <- subset(pbmc_seurat,
subset = percent.mt < 5 & nFeature_RNA > 200)
# Normalizing data
pbmc_seurat <- NormalizeData(pbmc_seurat)
# Identification of highly variable genes
pbmc_seurat <- FindVariableFeatures(pbmc_seurat,
selection.method = "vst",
nfeatures = 2000)
# Scaling data
pbmc_seurat <- ScaleData(pbmc_seurat,
features = rownames(pbmc_seurat))
# Perform principal component analysis
pbmc_seurat <- RunPCA(pbmc_seurat,
features = VariableFeatures(object = pbmc_seurat))
# Cell clustering
pbmc_seurat <- FindNeighbors(pbmc_seurat, dims = 1:10)
pbmc_seurat <- FindClusters(pbmc_seurat, resolution = 0.5)
# Dimension reduction
pbmc_seurat <- RunUMAP(pbmc_seurat, dims = 1:10)
DimPlot(pbmc_seurat, reduction = "umap")
library(scigenex)
# Run DBFMCL
pbmc_scigenex <- DBFMCL(data = pbmc_seurat,
k = 80,
inflation = 2,
min_cluster_size = 5,
av_dot_prod_min = 2)## The following parameters will be used :
## Working directory: /mnt/NAS7/Workspace/bavaisj/dbfmcl/scigenex/vignettes
## Ouput directory: /mnt/NAS7/Workspace/bavaisj/dbfmcl/scigenex/vignettes
## Name: g4YcH8n63K
## Distance method: pearson
## Minimum average dot product for clusters: 2
## Minimum cluster size: 5
## Number of neighbors: 80
## FDR: 10 %
## Inflation: 2
## Visualize standard outputs from both mcl and cluster commands: FALSE
##
## |-- DBF completed. Starting MCL step.
## Running mcl (graph partitioning)...
## |-- Done
## |-- creating file : /mnt/NAS7/Workspace/bavaisj/dbfmcl/scigenex/vignettes/g4YcH8n63K.mcl_out.txt
## |-- Reading and filtering MCL output: /mnt/NAS7/Workspace/bavaisj/dbfmcl/scigenex/vignettes/g4YcH8n63K.mcl_out.txt
## |-- 7 clusters conserved after MCL partitioning.
## |-- 48 clusters filtered out from MCL partitioning (size and mean dot product).
Clusters of genes are stored in the slot gene_patterns. You can access the gene names from a cluster using the command get_genes().
# Extract gene names from all cluster
genes <- get_genes(pbmc_scigenex, cluster = "all")
head(genes)## [1] "TYROBP" "CST3" "LST1" "AIF1" "FCN1" "FTL"
# Extract gene names from gene cluster 5
genes_5 <- get_genes(pbmc_scigenex, cluster = 5)
genes_5## [1] "UBE2C" "AURKB" "DEPDC1B" "BIRC5"
## [5] "KIFC1" "RRM2" "TYMS" "KIAA0101"
## [9] "CENPA" "CCNB2" "MKI67" "HMMR"
## [13] "MYBL2" "TOP2A" "CDT1" "CKAP2L"
## [17] "ASPM" "CEP55" "TK1" "KIF2C"
## [21] "UCHL1" "TROAP" "PBK" "CDC6"
## [25] "CIT" "PKMYT1" "CDK1" "CLSPN"
## [29] "CCNA2" "ZWINT" "EPDR1" "CDC20"
## [33] "CDCA8" "CENPF" "TRAIP" "CCNB1"
## [37] "RP5-1074L1.4" "TRIP13" "CDC45" "BRCA1"
## [41] "SLC4A4" "MTFR2" "FOXM1" "PEBP4"
## [45] "GTSE1" "CAV1" "FAM92A1" "MCM10"
## [49] "NUSAP1" "RAD51" "GINS2" "NUF2"
## [53] "TPX2" "PLK1" "RP11-160H22.5" "TRBV11-2"
## [57] "SGOL1" "CYSLTR2" "OIP5" "GVQW1"
## [61] "NCAPG2" "CCDC15" "ASF1B" "GPSM2"
## [65] "PHGDH" "KCNAB3" "OSGEPL1-AS1" "CDCA7"
## [69] "CENPM" "MXD3" "KIAA1524" "HMGB3"
## [73] "CENPW" "CDCA5" "CDC25A" "MYCBPAP"
## [77] "KCNC4" "WDR62" "HIST1H1B" "C21orf58"
## [81] "PAQR4" "SPAG5" "APOBEC3H" "NEK2"
## [85] "ESCO2" "DSCC1" "DONSON" "MCM2"
## [89] "POC1A" "NCAPH" "CHEK1" "ORC6"
## [93] "RMI2" "C3orf14" "RACGAP1" "CCDC136"
## [97] "IFT140" "POLR3B" "OSR2" "ARHGAP11A"
## [101] "RP11-1348G14.5" "AKR1E2" "AVEN"
You can also used the function top_genes to extract the top genes of each cluster of genes. Genes are ranked regarding their mean correlation values with genes in a same cluster and the top genes of each cluster are stored in the slot top_genes of the ClusterSet object.
# Find top 10 genes for all gene clusters
pbmc_scigenex <- top_genes(pbmc_scigenex, top = 10)
pbmc_scigenex@top_genes## |-- Results are stored in top_genes slot of ClusterSet object.
## gene_top_1 gene_top_2 gene_top_3 gene_top_4 gene_top_5 gene_top_6
## cluster_1 "CST3" "TYROBP" "LST1" "AIF1" "FCER1G" "FTL"
## cluster_2 "PTCRA" "SDPR" "GNG11" "PF4" "GP9" "TMEM40"
## cluster_3 "RPS12" "RPS6" "RPS27" "RPL13A" "RPS18" "RPL3"
## cluster_4 "NKG7" "GZMB" "PRF1" "CST7" "FGFBP2" "GZMA"
## cluster_5 "BIRC5" "RRM2" "KIFC1" "TYMS" "MKI67" "KIAA0101"
## cluster_6 "CD79A" "HLA-DQA1" "MS4A1" "CD79B" "HLA-DQB1" "TCL1A"
## cluster_7 "LRRC26" "CLEC4C" "LILRA4" "SERPINF1" "SCT" "TIFAB"
## gene_top_7 gene_top_8 gene_top_9 gene_top_10
## cluster_1 "TYMP" "FCN1" "FTH1" "LYZ"
## cluster_2 "ITGA2B" "PPBP" "C2orf88" "SPARC"
## cluster_3 "RPL13" "RPS3" "RPL23A" "RPS25"
## cluster_4 "GNLY" "CTSW" "SPON2" "GZMH"
## cluster_5 "CCNB2" "HMMR" "AURKB" "DEPDC1B"
## cluster_6 "CD74" "LINC00926" "VPREB3" "HLA-DQA2"
## cluster_7 "AC011893.3" "PLS3" "SMPD3" "GAS6"
You can acess the top genes of a specific cluster of genes using the get_genes function.
# Extract gene names from the top 10 genes of cluster 5
genes_cl5_top <- get_genes(pbmc_scigenex, cluster = 5, top = TRUE)
genes_cl5_top## [1] "BIRC5" "RRM2" "KIFC1" "TYMS" "MKI67" "KIAA0101"
## [7] "CCNB2" "HMMR" "AURKB" "DEPDC1B"
At the end of this step, you can use the genes found by SciGeneX as feature selection results. In the followed example, those genes are stored in the Seurat object and we re-run the Seurat pipeline using those genes.
# _______________________
# Run the classical pipeline of Seurat
# ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
# Use genes found by SciGeneX as feature selection results
pbmc_seurat@assays$RNA@var.features <- get_genes(pbmc_scigenex)
# Scaling data
pbmc_seurat <- ScaleData(pbmc_seurat,
features = rownames(pbmc_seurat))
# Perform principal component analysis
pbmc_seurat <- RunPCA(pbmc_seurat,
features = VariableFeatures(object = pbmc_seurat))
# Cell clustering
pbmc_seurat <- FindNeighbors(pbmc_seurat, dims = 1:10)
pbmc_seurat <- FindClusters(pbmc_seurat, resolution = 0.5)
# Dimension reduction
pbmc_seurat <- RunUMAP(pbmc_seurat, dims = 1:10)
DimPlot(pbmc_seurat, reduction = "umap")
cell_cluster <- pbmc_seurat[["seurat_clusters"]]
cell_cluster <- rownames(cell_cluster)[order(cell_cluster[,"seurat_clusters"])]
plot_heatmap(pbmc_scigenex,
cell_order = cell_cluster,
use_top_genes = TRUE,
line_size = 2) %>%
add_col_annotation(pbmc_seurat[["seurat_clusters"]][cell_cluster,"seurat_clusters"]) %>%
add_col_barplot(colMeans(GetAssayData(pbmc_seurat, slot = "data")[,cell_cluster]))## |-- Centering matrix.
## |-- Ordering cells.
## |-- Ceiling matrix.
## |-- Flooring matrix.
## |-- Plotting heatmap.